Introduction

This notebook looks explores the data from the DSRCT sample set, SCPCP000013. We then see if we can use expression of DSRCT-specific genes to manually classify tumor and normal cells. The main goal of this notebook is only to identify tumor cells, identification and labeling of the other cells is a separate question that we do not answer here.

Setup

suppressPackageStartupMessages({
  # load required packages
  library(SingleCellExperiment)
  library(ggplot2)
})
## Warning: package 'matrixStats' was built under R version 4.4.1
## Warning: package 'GenomicRanges' was built under R version 4.4.1
## Warning: package 'S4Vectors' was built under R version 4.4.1
## Warning: package 'IRanges' was built under R version 4.4.1
# Set default ggplot theme
theme_set(
  theme_bw()
)
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)

# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "2024-07-08")
#sample_dir <- file.path(data_dir, "SCPCP000013", params$sample_id)

# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-DSRCT")
metadata_file <- file.path(data_dir, "SCPCP000013", "single_cell_metadata.tsv")
metadata <- read.csv(metadata_file,  sep = '\t')
metadata_DSRCT <- dplyr::filter(metadata, diagnosis == 'Desmoplastic small round cell tumor')
sce_file_list <- file.path(
  data_dir, "SCPCP000013",  
  metadata_DSRCT$scpca_sample_id, 
  paste0(metadata_DSRCT$scpca_library_id, "_processed.rds")
)

marker_genes <- file.path(module_base, "references", "tumor-marker-genes.tsv")

# output tumor/normal classifications
results_dir <- file.path(module_base, "results", "marker_gene_analysis")
fs::dir_create(results_dir)

#classifications_filename <- glue::glue("{params$library_id}_tumor_normal_classifications.tsv")
#output_classifications_file <- file.path(results_dir, classifications_filename)

Read in each data as a separate object SingleCellExperiment in the list.

sce_list <- lapply(sce_file_list, readr::read_rds)

#adding the sample id to the name of each SingleCellExperiment
names(sce_list) <- metadata_DSRCT$scpca_library_id

# read in marker genes table 
marker_genes_df <- readr::read_tsv(marker_genes) |> 
  # account for genes being from multiple sources
  dplyr::select(cell_type, ensembl_gene_id, gene_symbol) |> 
  dplyr::distinct()
## Rows: 7 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): cell_type, gene_symbol, ensembl_gene_id, source
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
marker_genes_df
## # A tibble: 7 × 3
##   cell_type ensembl_gene_id gene_symbol
##   <chr>     <chr>           <chr>      
## 1 tumor     ENSG00000117069 ST6GALNAC5 
## 2 tumor     ENSG00000007402 CACNA2D2   
## 3 tumor     ENSG00000283154 IQCJ-SCHIP1
## 4 tumor     ENSG00000139304 PTPRQ      
## 5 tumor     ENSG00000069482 GAL        
## 6 tumor     ENSG00000197487 GALP       
## 7 tumor     ENSG00000165474 GJB2
marker_genes <- marker_genes_df |> 
  dplyr::filter(cell_type == "tumor") |> 
  dplyr::pull(ensembl_gene_id)

Analysis content

Explore marker gene expression

The first thing we do here is just create a faceted UMAP showing the expression of each marker gene for tumor cells.

umap_df_list <- lapply(sce_list, function(x) {
  # get the gene expression counts for all marker genes
  marker_gene_exp <- logcounts(x[marker_genes, ]) |>
    as.matrix() |>
    t() |>
    as.data.frame() |>
    tibble::rownames_to_column("barcodes")
  
  # pull out the UMAP coordinates and make a data frame to use for plotting
  umap_df <- x |>
    scuttle::makePerCellDF(use.dimred = "UMAP") |>
    # replace UMAP.1 with UMAP1
    dplyr::rename_with(\(x) stringr::str_replace(x, "^UMAP\\.", "UMAP")) |>
    # add in marker gene expression to dataframe
    dplyr::left_join(marker_gene_exp, by = "barcodes") |>
    # combine all genes into a single column for easy faceting
    tidyr::pivot_longer(cols = starts_with("ENSG"),
                        names_to = "ensembl_gene_id",
                        values_to = "gene_expression") |>
    # join with marker gene df to get gene symbols for plotting
    dplyr::left_join(marker_genes_df, by = c("ensembl_gene_id")) |>
    dplyr::select(barcodes,
                  UMAP1,
                  UMAP2,
                  gene_symbol,
                  ensembl_gene_id,
                  gene_expression,
                  cluster)
})

names(umap_df_list) <- metadata_DSRCT$scpca_sample_id
for(i in 1:length(umap_df_list)){
  # faceted umap showing a umap panel for each marker gene
  p <- ggplot(umap_df_list[[i]], aes(x = UMAP1, y = UMAP2, color = gene_expression)) +
    geom_point(alpha = 0.8, size = 0.1) +
    facet_wrap(vars(gene_symbol)) +
    scale_color_viridis_c() +
    labs(color = "Log-normalized gene expression") +
    # remove axis numbers and background grid
    scale_x_continuous(labels = NULL, breaks = NULL) +
    scale_y_continuous(labels = NULL, breaks = NULL) +
    theme(
      aspect.ratio = 1,
      legend.position = "bottom",
      axis.title = element_text(size = 9, color = "black"),
      strip.text = element_text(size = 8),
      legend.title = element_text(size = 9),
      legend.text = element_text(size = 8)
    ) +
    guides(colour = guide_colorbar(title.position = "bottom", title.hjust = 0.5)) +
    ggtitle(names(umap_df_list)[i])
  
  tiff(filename = paste0('../plots/marker_expression_', names(umap_df_list)[i], '.tiff'), width = 6, height = 6, units = 'in', res = 150)
  print(p)
  dev.off()
}

In my experience, ST6GALNAC5 is a strong marker for DSRCT in single-cell data. As can be seen, several of the samples have expression of ST6GALNAC5, but some do not, such as SCPCS000731 and SCPCS000729. In fact, these SCPCS000729 contains low number of cells. These samples in particular are the PDX samples and I will be excluding SCPCS000729 from further analyses.

umap_df_list_excluded <- umap_df_list[!(names(umap_df_list) %in% c('SCPCS000731'))]

We can also look at the distributions for each marker gene. I would expect to see some sort of bimodal distribution separating cells that do and do not have expression of the marker gene. What is clear from these plots that ST6GALNAC5, CACNA2D2, PTPRQ, IQCJ-SCHIP1, show a bi modal distribution among the cells. To some extent, we do see expression of the other markers.

for(i in 1:length(umap_df_list_excluded)) {
  p <- ggplot(umap_df_list_excluded[[i]],
              aes(x = gene_expression, fill = gene_symbol)) +
    geom_density() +
    facet_wrap(vars(gene_symbol), scales = 'free_y') +
    theme(legend.position = "none") + 
    ggtitle(names(umap_df_list_excluded)[i])
  print(p)
  
  #save plots
  tiff(filename = paste0('../plots/marker_distribution_', names(umap_df_list)[i], '.tiff'), width = 6, height = 6, units = 'in', res = 150)
  print(p)
  dev.off()
}

Although we see some slight semblance of a bimodal distribution for most marker genes, it is hard to see and we cannot directly compare gene expression values across genes. This would make it hard to identify a cut off to categorize cells as expression or not expressing the marker gene.

Now we will transform each of the gene expression vectors by generating z-scores. Then we might be able to find a cut off we can use across samples for if marker genes are present in a cell or not.

umap_df_list_excluded_scaled <- lapply(umap_df_list_excluded, function(x){
  x   |>
  dplyr::group_by(gene_symbol) |>
  # get z-scores for each gene
  dplyr::mutate(transformed_gene_expression = scale(gene_expression)[, 1]) |>
  dplyr::ungroup()
})

Now we can create the same density plot but looking at z-scores. Interestingly, in some samples, like SPCS000489, we see that ST6GALNAC5 has negative values. Presumably, these are tumor cells but there may be a skew due to the number of tumor cells compared to normal.

for(i in 1:length(umap_df_list_excluded_scaled)) {
  p <- ggplot(umap_df_list_excluded_scaled[[i]],
              aes(x = transformed_gene_expression, fill = gene_symbol)) +
    geom_density() +
    facet_wrap(vars(gene_symbol), scales = 'free_y') +
    theme(legend.position = "none") + 
    ggtitle(names(umap_df_list_excluded_scaled)[i])
  
  print(p)
  
  #save plots
  tiff(filename = paste0('../plots/marker_distribution_zscaled_', names(umap_df_list)[i], '.tiff'), width = 6, height = 6, units = 'in', res = 150)
  print(p)
  dev.off()
  
  
}
## Warning: Removed 1324 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 1324 rows containing non-finite outside the scale range
## (`stat_density()`).

## Warning: Removed 6140 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 6140 rows containing non-finite outside the scale range
## (`stat_density()`).

It looks like some marker genes have distinct groups of cells with z-score > 0, while other marker genes may not be as informative.

Classify tumor cells using marker genes only

Let’s try and use the marker gene expression to classify tumor cells. It looks like we could use a cutoff of z-score > 0 to count that cell as a tumor cell.

We could either count any cell that expresses at least one marker gene > 0 as a tumor cell, or look at the combined expression. Let’s start with classifying tumor cells as tumor if any marker gene is present (z-score > 0). This also means the sum of all z-scores > 0.

Below, we can get the sum of the transformed gene expression of all marker genes and plot in a single UMAP.

# calculate sum gene expression across all marker genes in list 
marker_sum_exp <- lapply(umap_df_list_excluded_scaled, function(x){
  x |>
  dplyr::group_by(barcodes) |> 
  dplyr::mutate(sum_exp = sum(transformed_gene_expression, na.rm = T)) |> 
  dplyr::select(barcodes, UMAP1, UMAP2, sum_exp, cluster) |> 
  dplyr::distinct()
})


# plot mean gene expression 

for(i in 1:length(marker_sum_exp)) {
  p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_exp)) +
  geom_point(size = 0.5, alpha = 0.5) +
  scale_color_viridis_c() +
    ggtitle(names(umap_df_list_excluded_scaled)[i])
  print(p)
  
  #save plots
  ggsave(
    filename = paste0(
      '../plots/UMAP_marker_expression_',
      names(umap_df_list)[i],
      '.png'
    ),
    plot = p,
    width = 6,
    height = 6,
    units = 'in',
    dpi = 150
  )
}

Similar to the individual plots, it looks like there is one group of cells on the bottom right that has the highest marker gene expression. We would anticipate that these are most likely to be the tumor cells.

Now let’s classify any cell that has a sum of marker genes > 0 (after z-transformation) as tumor cells.

# classify tumor cells based on presence of any marker genes 
marker_sum_exp <- lapply(marker_sum_exp, function(x){
  x |> 
  dplyr::mutate(sum_classification = dplyr::if_else(sum_exp > 0, "Tumor", "Normal"))})


for(i in 1:length(marker_sum_exp)) {
  p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_classification)) +
  geom_point(size = 0.5, alpha = 1) +
  ggtitle(names(umap_df_list_excluded_scaled)[i])
  print(p)
  
  #save plots
  ggsave(
    filename = paste0(
      '../plots/UMAP_classification_',
      names(umap_df_list)[i],
      '.png'
    ),
    plot = p,
    width = 6,
    height = 6,
    units = 'in',
    dpi = 150
  )
}

This gives us a rough idea of cells that may be classified as tumor cells. However, I believe re-doing this analysis using the cluster level data may give us better results. This is highly dependent on whether the clusters were generated with enough granularity.

# calculate sum gene expression across all marker genes in list 
marker_sum_exp <- lapply(umap_df_list_excluded_scaled, function(x){
  x |> 
  dplyr::group_by(cluster) |> #change to cluster
  dplyr::mutate(sum_exp = sum(transformed_gene_expression, na.rm = T)) |> 
  dplyr::select(barcodes, UMAP1, UMAP2, sum_exp, cluster) |> 
  dplyr::distinct()
})


# plot mean gene expression 

for(i in 1:length(marker_sum_exp)) {
  p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_exp)) +
  geom_point(size = 0.5, alpha = 0.5) +
  scale_color_viridis_c() +
    ggtitle(names(umap_df_list_excluded_scaled)[i])
  print(p)
  
  #save plots
  ggsave(
    filename = paste0('../plots/UMAP_marker_expression_cluster_', names(umap_df_list)[i], '.png'),
    plot = p,
    width = 6,
    height = 6,
    units = 'in',
    dpi = 150
  )
}

It looks like there are group of cells with high marker gene expression. We anticipate that these are most likely to be the tumor cells. The groups with low or negative values are likely normal cells.

Now let’s classify clusters that has a sum of marker genes > 0 (after z-transformation) as tumor cell clusters.

# classify tumor cells based on presence of any marker genes 
marker_sum_exp <- lapply(marker_sum_exp, function(x){
  d <- density(x$sum_exp)
  
  x |> 
  dplyr::mutate(sum_classification = dplyr::if_else(sum_exp > 0, "Tumor", "Normal"))})


for(i in 1:length(marker_sum_exp)) {
  p <- ggplot(marker_sum_exp[[i]], aes(x = UMAP1, y = UMAP2, color = sum_classification)) +
  geom_point(size = 0.5, alpha = 1) +
  ggtitle(names(umap_df_list_excluded_scaled)[i])
  print(p)

  #save plots
  ggsave(
    filename = paste0(
      '../plots/UMAP_classification_cluster_',
      names(umap_df_list)[i],
      '.png'
    ),
    plot = p,
    width = 6,
    height = 6,
    units = 'in',
    dpi = 150
  )

}

Based on my experiences, the above plots show that we are unable to adequately determine the tumor cells from the normal cells. This can be seen in SCPCS000729, which is most likely all tumor cells, since it is collected from a patient-derived xenograft.

Conclusions

Save outputs

# get an RDS of the processed data
saveRDS(sce_file_list, '../results/SCPCP000013_sce_file_list.rds')

Session Info

# record the versions of the packages used in this analysis and other environment information
sessionInfo()
## R version 4.4.0 (2024-04-24)
## Platform: aarch64-apple-darwin20
## Running under: macOS Ventura 13.6.8
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggplot2_3.5.1               SingleCellExperiment_1.26.0
##  [3] SummarizedExperiment_1.34.0 Biobase_2.64.0             
##  [5] GenomicRanges_1.56.2        GenomeInfoDb_1.40.1        
##  [7] IRanges_2.38.1              S4Vectors_0.42.1           
##  [9] BiocGenerics_0.50.0         MatrixGenerics_1.16.0      
## [11] matrixStats_1.4.1          
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.5              xfun_0.48                
##  [3] bslib_0.8.0               lattice_0.22-6           
##  [5] tzdb_0.4.0                vctrs_0.6.5              
##  [7] tools_4.4.0               generics_0.1.3           
##  [9] parallel_4.4.0            tibble_3.2.1             
## [11] fansi_1.0.6               highr_0.11               
## [13] pkgconfig_2.0.3           Matrix_1.7-0             
## [15] sparseMatrixStats_1.16.0  lifecycle_1.0.4          
## [17] GenomeInfoDbData_1.2.12   farver_2.1.2             
## [19] stringr_1.5.1             compiler_4.4.0           
## [21] textshaping_0.4.0         munsell_0.5.1            
## [23] codetools_0.2-20          htmltools_0.5.8.1        
## [25] sass_0.4.9                yaml_2.3.10              
## [27] pillar_1.9.0              crayon_1.5.3             
## [29] jquerylib_0.1.4           tidyr_1.3.1              
## [31] BiocParallel_1.38.0       DelayedArray_0.30.1      
## [33] cachem_1.1.0              abind_1.4-8              
## [35] tidyselect_1.2.1          digest_0.6.37            
## [37] stringi_1.8.4             purrr_1.0.2              
## [39] dplyr_1.1.4               labeling_0.4.3           
## [41] rprojroot_2.0.4           fastmap_1.2.0            
## [43] grid_4.4.0                colorspace_2.1-1         
## [45] cli_3.6.3                 SparseArray_1.4.8        
## [47] magrittr_2.0.3            S4Arrays_1.4.1           
## [49] utf8_1.2.4                readr_2.1.5              
## [51] withr_3.0.1               DelayedMatrixStats_1.26.0
## [53] scales_1.3.0              UCSC.utils_1.0.0         
## [55] bit64_4.5.2               rmarkdown_2.28           
## [57] XVector_0.44.0            httr_1.4.7               
## [59] bit_4.5.0                 ragg_1.3.3               
## [61] hms_1.1.3                 beachmat_2.20.0          
## [63] evaluate_1.0.1            knitr_1.48               
## [65] viridisLite_0.4.2         rlang_1.1.4              
## [67] Rcpp_1.0.13               scuttle_1.14.0           
## [69] glue_1.8.0                rstudioapi_0.17.0        
## [71] vroom_1.6.5               jsonlite_1.8.9           
## [73] R6_2.5.1                  systemfonts_1.1.0        
## [75] fs_1.6.4                  zlibbioc_1.50.0